library(caret)
library(lme4)
library(lmerTest)
library(ggplot2)
library(stringi)
library(gridExtra)
library(ggfortify)
library(dendextend)
library(psych)
library(kableExtra)
library(tidyverse)
library(dtplyr)
source('other_functions.R')
source('plotting_functions.R')
# set load_outputdata=TRUE if you want to skip long computations (normalization and DEA) and just load the previously saved data sets
load_outputdata=TRUE
data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l # data in long format
# which proteins were spiked in?
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character
#TEMP -remove!
# tmp=dat.l %>% distinct(Protein) %>% pull %>% as.character
# dat.l <- dat.l %>% filter(Protein %in% c(tmp[sample(1:length(tmp), size=20)], spiked.proteins))
# specify # of varying component variants and their names
variant.names <- c('log2_intensity', 'intensity', 'ratio')
n.comp.variants <- length(variant.names)
scale.vec <- c('log', 'raw', 'raw') # ratios are considered raw, because they are basically mean-normalized intensities
# pick reference condition for making plots / doing DEA
referenceCondition <- '0.5'
# specify colours corresponding to biological conditions
condition.color <- tribble(
  ~Condition, ~Color,
  "0.125", 'black',
  "0.5", 'blue',
  "0.667", 'green',
  "1", 'red' )
# create data frame with sample info (distinct Run,Channel, Condition, Color)
sample.info <- get_sample_info(dat.l, condition.color)
channelNames <- remove_factors(unique(sample.info$Channel))

1 Unit scale component

dat.unit.l <- emptyList(variant.names)

1.1 log2 transformation of reporter ion intensities

dat.unit.l$log2_intensity <- dat.l %>% mutate(response=log2(intensity)) %>% select(-intensity)

1.2 intensities on the original scale

dat.unit.l$intensity <- dat.l %>% rename(response=intensity)

1.3 intensity ratios

Calculate ratio (per PSM) with respect to average intensity within run, or in other words: each value is divided by the row mean.

# use half-wide data to compute within-run average of PSM channels corresponding to the reference Condition
refCols <- sample.info %>% distinct(Channel) %>% pull(Channel)
denom.df=dat.l %>% pivot_wider(id_cols=-one_of('Condition', 'BioReplicate'),names_from='Channel', values_from='intensity')
denom.df$denom=apply(denom.df[,refCols], 1, function(x) mean(x, na.rm=T))
denom.df=denom.df[,c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')]
dat.unit.l$ratio <- dat.l %>% left_join(denom.df[c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')], by=c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM')) %>% mutate(response=intensity/denom) %>% select(-c(intensity, denom)) 

2 Summarization component: no summarization

# no summarization 
dat.summ.l <- dat.unit.l

3 Normalization component: linear mixed model

if (!load_outputdata){
dat.norm.l <- dat.summ.l
# fit normalization model
norm.models <- lapply(dat.summ.l, function(x) return(lmer(response ~ Mixture + Mixture:TechRepMixture + Mixture:TechRepMixture:Channel + (1|Protein) + (1|Mixture:TechRepMixture:Peptide), data=x)))
# assign normalized values
for (i in 1:n.comp.variants){ dat.norm.l[[variant.names[i]]]$response <- residuals(norm.models[[variant.names[i]]]) }
# apply the 'fix' - add the model intercept to the residuals
dat.norm.l$intensity_fix <- dat.norm.l$intensity; dat.norm.l$intensity_fix$response <- dat.norm.l$intensity_fix$response + fixef(norm.models$intensity)['(Intercept)']
dat.norm.l$ratio_fix <- dat.norm.l$ratio; dat.norm.l$ratio_fix$response <- dat.norm.l$ratio_fix$response + fixef(norm.models$ratio)['(Intercept)']
# update variant names, #, scale
variant.names <- names(dat.norm.l)
scale.vec <- c(scale.vec, 'raw', 'raw')
n.comp.variants <- length(variant.names)
tmp <- c("log2_intensity", "intensity", "intensity_fix", "ratio", "ratio_fix")
variant.names <- tmp
dat.norm.l <- dat.norm.l[tmp]
# for technical reasons, assign new variants to dat.summ.l
dat.summ.l$intensity_fix <- dat.summ.l$intensity
dat.summ.l$ratio_fix <- dat.summ.l$ratio
dat.summ.l <- dat.summ.l[tmp]
}

4 QC plots

if (!load_outputdata){
dat.nonnorm.summ.l <- lapply(dat.summ.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein', 'Peptide'), 'median')) 
dat.nonnorm.summ.l <- lapply(dat.nonnorm.summ.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein'), 'median'))

dat.norm.summ.l <- lapply(dat.norm.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein', 'Peptide'), 'median')) 
dat.norm.summ.l <- lapply(dat.norm.summ.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein'), 'median')) 

# make data completely wide (also across runs)
## normalized data
dat.norm.summ.w2 <- lapply(dat.norm.summ.l, function(x){
  dat.tmp <- pivot_wider(data=x, id_cols=Protein, names_from=Run:Channel, values_from=response, names_sep=':')
  return(dat.tmp)})

## non-normalized data
dat.nonnorm.summ.w2 <- lapply(dat.nonnorm.summ.l, function(x){
  dat.tmp <- pivot_wider(data=x, id_cols=Protein, names_from=Run:Channel, values_from=response, names_sep=':')
  return(dat.tmp)})
}

4.1 Boxplot

par(mfrow=c(1,2))
for (i in 1:n.comp.variants){
  boxplot_ils(dat.nonnorm.summ.l[[variant.names[i]]], paste('raw', variant.names[i], sep='_'))
  boxplot_ils(dat.norm.summ.l[[variant.names[i]]], paste('normalized', variant.names[i], sep='_'))}

4.2 MA plots

MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).

for (i in 1:n.comp.variants){
  p1 <- maplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('raw', variant.names[i], sep='_'), spiked.proteins)
  p2 <- maplot_ils(dat.norm.summ.w2[[variant.names[i]]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('normalized', variant.names[i], sep='_'), spiked.proteins)
  grid.arrange(p1, p2, ncol=2)}  

MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).

channels.num <- sample.info %>% filter(Condition=='1') %>% distinct(Run:Channel) %>% pull
channels.denom <- sample.info %>% filter(Condition=='0.125') %>% distinct(Run:Channel) %>% pull
for (i in 1:n.comp.variants){
  p1 <- maplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]], channels.num, channels.denom, scale=scale.vec[i], paste('raw', variant.names[i], sep='_'), spiked.proteins)
  p2 <- maplot_ils(dat.norm.summ.w2[[variant.names[i]]], channels.num, channels.denom, scale=scale.vec[i], paste('normalized', variant.names[i], sep='_'), spiked.proteins)
grid.arrange(p1, p2, ncol=2)}

4.3 PCA plots

4.3.1 Using all proteins

par(mfrow=c(1, 2))
for (i in 1:n.comp.variants){
  pca.scale=FALSE
  if (variant.names[i] %in% c('intensity', 'intensity_fix')) pca.scale=TRUE
  pcaplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('raw', variant.names[i], sep='_'), scale=pca.scale)
  pcaplot_ils(dat.norm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('normalized', variant.names[i], sep='_'), scale=pca.scale)}

4.3.2 Using spiked proteins only

par(mfrow=c(1, 2))
for (i in 1:n.comp.variants){
  pca.scale=FALSE
  if (variant.names[i] %in% c('intensity', 'intensity_fix')) pca.scale=TRUE
  pcaplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('raw', variant.names[i], sep='_'), scale=pca.scale)
  pcaplot_ils(dat.norm.summ.w2[[variant.names[i]]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('normalized', variant.names[i], sep='_'), scale=pca.scale)}

HC (hierarchical clustering) plots

4.3.3 Using all proteins

par(mfrow=c(1,2))
for (i in 1:n.comp.variants){
  dendrogram_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('raw', variant.names[i], sep='_'))
  dendrogram_ils(dat.norm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('normalized', variant.names[i], sep='_'))}

4.4 Run effect p-value plot

Below the histograms of p-values from the linear model explaining the response variable with Run as a covariate.

plots <- vector('list', n.comp.variants)
for (i in 1:n.comp.variants){
dat <- list(dat.nonnorm.summ.l[[variant.names[i]]], dat.norm.summ.l[[variant.names[i]]])
names(dat) <- c(paste('raw', variant.names[i], sep='_'), paste('normalized', variant.names[i], sep='_'))
plots[[i]] <- run_effect_plot(dat)}
grid.arrange(grobs = plots, nrow=n.comp.variants)

5 DEA component: linear mixed model

Intra-protein correlation modelled with 1|Run:Channel random effect.

if (!load_outputdata){
dat.dea <- emptyList(variant.names)
for(i in seq_along(dat.dea)){
  dat.dea[[variant.names[i]]] <- lmm_dea(dat=dat.norm.l[[variant.names[i]]], mod.formula='response ~ Condition + (1|Run:Channel)', scale=scale.vec[i], referenceCondition)}
}

# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
significance.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)

6 Results comparison

6.1 Confusion matrix

cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm, referenceCondition)
0.125 vs 0.5 contrast
log2_intensity
intensity
intensity_fix
ratio
ratio_fix
background spiked background spiked background spiked background spiked background spiked
not_DE 4060 4 4015 13 4016 13 4060 5 4060 5
DE 3 15 48 6 48 6 4 14 4 14
0.125 vs 0.5 contrast
log2_intensity intensity intensity_fix ratio ratio_fix
Accuracy 0.998 0.985 0.985 0.998 0.998
Sensitivity 0.789 0.316 0.316 0.737 0.737
Specificity 0.999 0.988 0.988 0.999 0.999
PPV 0.833 0.111 0.111 0.778 0.778
NPV 0.999 0.997 0.997 0.999 0.999
0.667 vs 0.5 contrast
log2_intensity
intensity
intensity_fix
ratio
ratio_fix
background spiked background spiked background spiked background spiked background spiked
not_DE 4063 17 4020 19 4021 19 4062 15 4062 15
DE 0 2 43 0 43 0 2 4 2 4
0.667 vs 0.5 contrast
log2_intensity intensity intensity_fix ratio ratio_fix
Accuracy 0.996 0.985 0.985 0.996 0.996
Sensitivity 0.105 0.000 0.000 0.211 0.211
Specificity 1.000 0.989 0.989 1.000 1.000
PPV 1.000 0.000 0.000 0.667 0.667
NPV 0.996 0.995 0.995 0.996 0.996
1 vs 0.5 contrast
log2_intensity
intensity
intensity_fix
ratio
ratio_fix
background spiked background spiked background spiked background spiked background spiked
not_DE 4060 4 4061 10 4062 10 4059 4 4059 4
DE 3 15 2 9 2 9 5 15 5 15
1 vs 0.5 contrast
log2_intensity intensity intensity_fix ratio ratio_fix
Accuracy 0.998 0.997 0.997 0.998 0.998
Sensitivity 0.789 0.474 0.474 0.789 0.789
Specificity 0.999 1.000 1.000 0.999 0.999
PPV 0.833 0.818 0.818 0.750 0.750
NPV 0.999 0.998 0.998 0.999 0.999

6.2 Scatter plots

scatterplot_ils(dat.dea, significance.cols, 'q-values', spiked.proteins)

scatterplot_ils(dat.dea, logFC.cols, 'log2FC', spiked.proteins)

6.3 Volcano plots

for (i in 1:n.contrasts){
  volcanoplot_ils(dat.dea, i, spiked.proteins)}

6.4 Violin plots

Let’s see whether the spiked protein fold changes make sense

# plot theoretical value (horizontal lines) and violin per variant
violinplot_ils(lapply(dat.dea, function(x) x[spiked.proteins, logFC.cols]), referenceCondition)

7 Conclusions

8 Session information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252  LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] dtplyr_1.0.1      forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4      
##  [6] readr_1.4.0       tidyr_1.1.2       tibble_3.0.4      tidyverse_1.3.0   kableExtra_1.3.1 
## [11] psych_2.0.9       dendextend_1.14.0 ggfortify_0.4.11  gridExtra_2.3     stringi_1.5.3    
## [16] lmerTest_3.1-3    lme4_1.1-25       Matrix_1.2-18     caret_6.0-86      ggplot2_3.3.2    
## [21] lattice_0.20-41   rmarkdown_2.5    
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1          backports_1.1.10      plyr_1.8.6            splines_4.0.3        
##   [5] dplR_1.7.1            BiocParallel_1.22.0   TH.data_1.0-10        digest_0.6.26        
##   [9] foreach_1.5.1         htmltools_0.5.0       viridis_0.5.1         fansi_0.4.1          
##  [13] ROTS_1.16.0           magrittr_1.5          doParallel_1.0.16     limma_3.44.3         
##  [17] recipes_0.1.15        modelr_0.1.8          gower_0.2.2           matrixStats_0.57.0   
##  [21] R.utils_2.10.1        sandwich_3.0-0        colorspace_1.4-1      signal_0.7-6         
##  [25] rvest_0.3.6           JGmisc_0.3            haven_2.3.1           xfun_0.19            
##  [29] crayon_1.3.4          jsonlite_1.7.1        libcoin_1.0-6         impute_1.62.0        
##  [33] survival_3.2-7        zoo_1.8-8             iterators_1.0.13      glue_1.4.2           
##  [37] gtable_0.3.0          ipred_0.9-9           zlibbioc_1.34.0       webshot_0.5.2        
##  [41] NOMAD_0.99.0          BiocGenerics_0.34.0   scales_1.1.1          vsn_3.56.0           
##  [45] mvtnorm_1.1-1         DBI_1.1.0             Rcpp_1.0.5            mzR_2.22.0           
##  [49] viridisLite_0.3.0     tmvnsim_1.0-2         CONSTANd_0.99.0       preprocessCore_1.50.0
##  [53] matrixTests_0.1.9     stats4_4.0.3          lava_1.6.8.1          prodlim_2019.11.13   
##  [57] httr_1.4.2            ellipsis_0.3.1        modeltools_0.2-23     farver_2.0.3         
##  [61] pkgconfig_2.0.3       XML_3.99-0.5          R.methodsS3_1.8.1     nnet_7.3-14          
##  [65] dbplyr_2.0.0          utf8_1.1.4            labeling_0.4.2        tidyselect_1.1.0     
##  [69] rlang_0.4.8           reshape2_1.4.4        munsell_0.5.0         cellranger_1.1.0     
##  [73] tools_4.0.3           cli_2.1.0             generics_0.1.0        broom_0.7.2          
##  [77] evaluate_0.14         mzID_1.26.0           yaml_2.2.1            ModelMetrics_1.2.2.2 
##  [81] knitr_1.30            fs_1.5.0              admisc_0.11           ncdf4_1.17           
##  [85] coin_1.3-1            DEqMS_1.6.0           nlme_3.1-149          R.oo_1.24.0          
##  [89] venn_1.9              xml2_1.3.2            compiler_4.0.3        rstudioapi_0.12      
##  [93] png_0.1-7             e1071_1.7-4           affyio_1.58.0         reprex_0.3.0         
##  [97] statmod_1.4.35        highr_0.8             MSnbase_2.14.2        ProtGenerics_1.20.0  
## [101] nloptr_1.2.2.2        vctrs_0.3.4           pillar_1.4.6          lifecycle_0.2.0      
## [105] BiocManager_1.30.10   MALDIquant_1.19.3     data.table_1.13.2     R6_2.5.0             
## [109] pcaMethods_1.80.0     affy_1.66.0           IRanges_2.22.2        codetools_0.2-16     
## [113] boot_1.3-25           MASS_7.3-53           assertthat_0.2.1      withr_2.3.0          
## [117] mnormt_2.0.2          multcomp_1.4-14       S4Vectors_0.26.1      mgcv_1.8-33          
## [121] parallel_4.0.3        hms_0.5.3             grid_4.0.3            rpart_4.1-15         
## [125] timeDate_3043.102     minqa_1.2.4           class_7.3-17          pROC_1.16.2          
## [129] numDeriv_2016.8-1.1   Biobase_2.48.0        lubridate_1.7.9